# IMPORT LIBRARIES
# PROJECT DATASET FOUND AT https://ourworldindata.org/covid-deaths*
# * ORIGINAL SOURCE AND RAW DATA FOUND AT
# COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at John Hopkins University/
# JHU CSSE COVID-19 DATA - Located at https://github.com/CSSEGISandData/COVID-19
import pandas as pd
import numpy as np
from datetime import date
pd.set_option('display.max_rows',150)
pd.options.mode.chained_assignment = None
import plotly.express as px
import plotly.graph_objects as go
# IMPORT DATASET
covid_datadf = pd.read_csv(r'C:\Users\jksmi\Downloads\master_covid_data.csv')
# OBSERVE DATASET
covid_datadf
| iso_code | continent | location | date | population | total_cases | new_cases | new_cases_smoothed | total_deaths | new_deaths | ... | female_smokers | male_smokers | handwashing_facilities | hospital_beds_per_thousand | life_expectancy | human_development_index | excess_mortality_cumulative_absolute | excess_mortality_cumulative | excess_mortality | excess_mortality_cumulative_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Asia | Afghanistan | 2/24/2020 | 39835428.0 | 5.0 | 5.0 | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 1 | AFG | Asia | Afghanistan | 2/25/2020 | 39835428.0 | 5.0 | 0.0 | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 2 | AFG | Asia | Afghanistan | 2/26/2020 | 39835428.0 | 5.0 | 0.0 | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 3 | AFG | Asia | Afghanistan | 2/27/2020 | 39835428.0 | 5.0 | 0.0 | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 4 | AFG | Asia | Afghanistan | 2/28/2020 | 39835428.0 | 5.0 | 0.0 | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 138722 | ZWE | Africa | Zimbabwe | 12/4/2021 | 15092171.0 | 138523.0 | 1082.0 | 669.571 | 4709.0 | 1.0 | ... | 1.6 | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | NaN | NaN | NaN | NaN |
| 138723 | ZWE | Africa | Zimbabwe | 12/5/2021 | 15092171.0 | 139046.0 | 523.0 | 727.857 | 4710.0 | 1.0 | ... | 1.6 | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | NaN | NaN | NaN | NaN |
| 138724 | ZWE | Africa | Zimbabwe | 12/6/2021 | 15092171.0 | 139046.0 | 0.0 | 688.571 | 4710.0 | 0.0 | ... | 1.6 | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | NaN | NaN | NaN | NaN |
| 138725 | ZWE | Africa | Zimbabwe | 12/7/2021 | 15092171.0 | 141601.0 | 2555.0 | 996.571 | 4713.0 | 3.0 | ... | 1.6 | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | NaN | NaN | NaN | NaN |
| 138726 | ZWE | Africa | Zimbabwe | 12/8/2021 | 15092171.0 | 150628.0 | 9027.0 | 2184.429 | 4720.0 | 7.0 | ... | 1.6 | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | NaN | NaN | NaN | NaN |
138727 rows × 67 columns
covid_datadf.describe()
| population | total_cases | new_cases | new_cases_smoothed | total_deaths | new_deaths | new_deaths_smoothed | total_cases_per_million | new_cases_per_million | new_cases_smoothed_per_million | ... | female_smokers | male_smokers | handwashing_facilities | hospital_beds_per_thousand | life_expectancy | human_development_index | excess_mortality_cumulative_absolute | excess_mortality_cumulative | excess_mortality | excess_mortality_cumulative_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.377390e+05 | 1.310000e+05 | 130994.000000 | 129954.000000 | 1.196660e+05 | 119862.000000 | 129954.000000 | 130328.000000 | 130323.000000 | 129288.000000 | ... | 93274.000000 | 91928.000000 | 60278.000000 | 108801.000000 | 128970.000000 | 119877.000000 | 4894.000000 | 4894.000000 | 4894.000000 | 4894.000000 |
| mean | 1.568339e+08 | 2.180052e+06 | 8535.869185 | 8541.670343 | 5.305418e+04 | 182.800988 | 167.866251 | 21307.046606 | 90.895441 | 90.728166 | ... | 10.589385 | 32.754137 | 50.839891 | 3.028244 | 73.262089 | 0.726055 | 33989.159706 | 9.265480 | 16.417652 | 843.997485 |
| std | 7.269739e+08 | 1.237652e+07 | 44427.334924 | 43803.864608 | 2.716035e+05 | 865.220373 | 812.301657 | 35035.087686 | 214.386916 | 181.142618 | ... | 10.502004 | 13.517123 | 31.817432 | 2.454183 | 7.528847 | 0.150018 | 95064.788868 | 17.389516 | 31.751464 | 1253.396483 |
| min | 4.700000e+01 | 0.000000e+00 | -74347.000000 | -6223.000000 | 1.000000e+00 | -1918.000000 | -232.143000 | 0.001000 | -3125.829000 | -272.971000 | ... | 0.100000 | 7.700000 | 1.188000 | 0.100000 | 53.280000 | 0.394000 | -31959.400000 | -27.350000 | -95.920000 | -1728.844404 |
| 25% | 2.015490e+06 | 2.613000e+03 | 3.000000 | 10.571000 | 8.500000e+01 | 0.000000 | 0.143000 | 454.614750 | 0.326000 | 1.649000 | ... | 1.900000 | 21.600000 | 20.859000 | 1.300000 | 67.940000 | 0.602000 | -137.100000 | -0.857500 | -0.530000 | -38.187501 |
| 50% | 9.749625e+06 | 3.012450e+04 | 105.000000 | 131.429000 | 7.980000e+02 | 2.000000 | 2.000000 | 3518.864000 | 11.401000 | 16.187500 | ... | 6.300000 | 31.400000 | 49.839000 | 2.400000 | 74.620000 | 0.744000 | 2867.650000 | 5.550000 | 7.025000 | 390.200482 |
| 75% | 3.734479e+07 | 2.825198e+05 | 1099.000000 | 1159.143000 | 6.975500e+03 | 22.000000 | 18.857000 | 27440.095500 | 85.506000 | 95.657000 | ... | 19.300000 | 41.300000 | 83.241000 | 4.000000 | 78.740000 | 0.845000 | 22007.800000 | 13.980000 | 22.787500 | 1409.343556 |
| max | 7.874966e+09 | 2.678298e+08 | 908289.000000 | 827220.000000 | 5.279200e+06 | 18007.000000 | 14703.286000 | 253218.290000 | 8620.690000 | 3544.004000 | ... | 44.000000 | 78.100000 | 100.000000 | 13.800000 | 86.750000 | 0.957000 | 984309.300000 | 121.160000 | 373.940000 | 7270.553623 |
8 rows × 62 columns
# CHECK THE DATA TYPES FOR THE COLUMNS
covid_datadf.dtypes
iso_code object continent object location object date object population float64 total_cases float64 new_cases float64 new_cases_smoothed float64 total_deaths float64 new_deaths float64 new_deaths_smoothed float64 total_cases_per_million float64 new_cases_per_million float64 new_cases_smoothed_per_million float64 total_deaths_per_million float64 new_deaths_per_million float64 new_deaths_smoothed_per_million float64 reproduction_rate float64 icu_patients float64 icu_patients_per_million float64 hosp_patients float64 hosp_patients_per_million float64 weekly_icu_admissions float64 weekly_icu_admissions_per_million float64 weekly_hosp_admissions float64 weekly_hosp_admissions_per_million float64 new_tests float64 total_tests float64 total_tests_per_thousand float64 new_tests_per_thousand float64 new_tests_smoothed float64 new_tests_smoothed_per_thousand float64 positive_rate float64 tests_per_case float64 tests_units object total_vaccinations float64 people_vaccinated float64 people_fully_vaccinated float64 total_boosters float64 new_vaccinations float64 new_vaccinations_smoothed float64 total_vaccinations_per_hundred float64 people_vaccinated_per_hundred float64 people_fully_vaccinated_per_hundred float64 total_boosters_per_hundred float64 new_vaccinations_smoothed_per_million float64 new_people_vaccinated_smoothed float64 new_people_vaccinated_smoothed_per_hundred float64 stringency_index float64 population_density float64 median_age float64 aged_65_older float64 aged_70_older float64 gdp_per_capita float64 extreme_poverty float64 cardiovasc_death_rate float64 diabetes_prevalence float64 female_smokers float64 male_smokers float64 handwashing_facilities float64 hospital_beds_per_thousand float64 life_expectancy float64 human_development_index float64 excess_mortality_cumulative_absolute float64 excess_mortality_cumulative float64 excess_mortality float64 excess_mortality_cumulative_per_million float64 dtype: object
# THE ORIGINAL DATASET WILL BE SPLIT INTO TWO (1. covid_deaths, 2. covid_vaccinations). IN ORDER TO DO THIS, I WILL
# PARSE AND APPEND THE 'covid_datadf' DATAFRAME USING THE INDEX NUMBERS OF THE COLUMNS
# FIRST I WILL NEED TO OBTAIN AND DISPLAY THE INDEX NUMBERS OF THE COLUMNS IN THE DATAFRAME
# CREATE A FUNCTION THAT WILL GET THE INDEX LOCATION OF THE COLUMN FOR EVERY COLUMN FOR THE DATAFRAME THAT IS PASSED
def column_index(df):
for c in df.columns:
if c in df.columns:
df.columns.get_loc(c)
print(df.columns.get_loc(c),c)
# NOW TO CALL 'covid_column_index' FUNCTION
column_index(covid_datadf)
0 iso_code 1 continent 2 location 3 date 4 population 5 total_cases 6 new_cases 7 new_cases_smoothed 8 total_deaths 9 new_deaths 10 new_deaths_smoothed 11 total_cases_per_million 12 new_cases_per_million 13 new_cases_smoothed_per_million 14 total_deaths_per_million 15 new_deaths_per_million 16 new_deaths_smoothed_per_million 17 reproduction_rate 18 icu_patients 19 icu_patients_per_million 20 hosp_patients 21 hosp_patients_per_million 22 weekly_icu_admissions 23 weekly_icu_admissions_per_million 24 weekly_hosp_admissions 25 weekly_hosp_admissions_per_million 26 new_tests 27 total_tests 28 total_tests_per_thousand 29 new_tests_per_thousand 30 new_tests_smoothed 31 new_tests_smoothed_per_thousand 32 positive_rate 33 tests_per_case 34 tests_units 35 total_vaccinations 36 people_vaccinated 37 people_fully_vaccinated 38 total_boosters 39 new_vaccinations 40 new_vaccinations_smoothed 41 total_vaccinations_per_hundred 42 people_vaccinated_per_hundred 43 people_fully_vaccinated_per_hundred 44 total_boosters_per_hundred 45 new_vaccinations_smoothed_per_million 46 new_people_vaccinated_smoothed 47 new_people_vaccinated_smoothed_per_hundred 48 stringency_index 49 population_density 50 median_age 51 aged_65_older 52 aged_70_older 53 gdp_per_capita 54 extreme_poverty 55 cardiovasc_death_rate 56 diabetes_prevalence 57 female_smokers 58 male_smokers 59 handwashing_facilities 60 hospital_beds_per_thousand 61 life_expectancy 62 human_development_index 63 excess_mortality_cumulative_absolute 64 excess_mortality_cumulative 65 excess_mortality 66 excess_mortality_cumulative_per_million
# NOW THAT THE COLUMN INDEXES ARE KNOWN, THE 'covid_deaths' AND 'covid_vaccinations' DATAFRAMES CAN BE
# CREATED USING THE RANGE FUNCTION
# FIRST, CREATE THE 'covid_deaths' DATAFRAME
covid_deaths = covid_datadf.iloc[:,0:26]
# NOW TO TAKE A LOOK AT THE NEWLY CREATED 'covid_deaths' DATAFRAME
covid_deaths
| iso_code | continent | location | date | population | total_cases | new_cases | new_cases_smoothed | total_deaths | new_deaths | ... | new_deaths_smoothed_per_million | reproduction_rate | icu_patients | icu_patients_per_million | hosp_patients | hosp_patients_per_million | weekly_icu_admissions | weekly_icu_admissions_per_million | weekly_hosp_admissions | weekly_hosp_admissions_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Asia | Afghanistan | 2/24/2020 | 39835428.0 | 5.0 | 5.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | AFG | Asia | Afghanistan | 2/25/2020 | 39835428.0 | 5.0 | 0.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | AFG | Asia | Afghanistan | 2/26/2020 | 39835428.0 | 5.0 | 0.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | AFG | Asia | Afghanistan | 2/27/2020 | 39835428.0 | 5.0 | 0.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | AFG | Asia | Afghanistan | 2/28/2020 | 39835428.0 | 5.0 | 0.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 138722 | ZWE | Africa | Zimbabwe | 12/4/2021 | 15092171.0 | 138523.0 | 1082.0 | 669.571 | 4709.0 | 1.0 | ... | 0.047 | 3.09 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 138723 | ZWE | Africa | Zimbabwe | 12/5/2021 | 15092171.0 | 139046.0 | 523.0 | 727.857 | 4710.0 | 1.0 | ... | 0.047 | 3.04 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 138724 | ZWE | Africa | Zimbabwe | 12/6/2021 | 15092171.0 | 139046.0 | 0.0 | 688.571 | 4710.0 | 0.0 | ... | 0.038 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 138725 | ZWE | Africa | Zimbabwe | 12/7/2021 | 15092171.0 | 141601.0 | 2555.0 | 996.571 | 4713.0 | 3.0 | ... | 0.057 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 138726 | ZWE | Africa | Zimbabwe | 12/8/2021 | 15092171.0 | 150628.0 | 9027.0 | 2184.429 | 4720.0 | 7.0 | ... | 0.123 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
138727 rows × 26 columns
covid_deaths.columns.tolist()
['iso_code', 'continent', 'location', 'date', 'population', 'total_cases', 'new_cases', 'new_cases_smoothed', 'total_deaths', 'new_deaths', 'new_deaths_smoothed', 'total_cases_per_million', 'new_cases_per_million', 'new_cases_smoothed_per_million', 'total_deaths_per_million', 'new_deaths_per_million', 'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients', 'icu_patients_per_million', 'hosp_patients', 'hosp_patients_per_million', 'weekly_icu_admissions', 'weekly_icu_admissions_per_million', 'weekly_hosp_admissions', 'weekly_hosp_admissions_per_million']
# NOW TO CREATE THE 'covid_vaccinations' DATAFRAME
# TWO TEMPORARY DATAFRAMES WILL USED TO ACCOMPLISH THIS AS I WILL NEED TO ACCOMPLISH THIS: THE FIRST ('covidtmp1')
# WITH COLUMNS 1-4 FROM THE 'covid_datadf' AND THE SECOND ('covidtmp2') WITH COLUMNS 26-66 FROM THE 'covid_datadf'
# THEN I WILL COMBINE (CONCAT) THE TWO TOGETHER TO CREATE THE 'covid_vaccinations' DATAFRAME
# NOW TO CREATE THE FIRST TEMPORARY DATAFRAME('covidtmp1')
covidtmp1 = covid_datadf.iloc[:,0:4]
# NOW TO CHECK THE 'covidtmp1' DATAFRAME TO MAKE SURE IT HAS THE NECESSARY COLUMNS
covidtmp1.columns
Index(['iso_code', 'continent', 'location', 'date'], dtype='object')
# NOW TO CHECK THE DATA IN THE 'covidtmp1' DATAFRAME
covidtmp1.head()
| iso_code | continent | location | date | |
|---|---|---|---|---|
| 0 | AFG | Asia | Afghanistan | 2/24/2020 |
| 1 | AFG | Asia | Afghanistan | 2/25/2020 |
| 2 | AFG | Asia | Afghanistan | 2/26/2020 |
| 3 | AFG | Asia | Afghanistan | 2/27/2020 |
| 4 | AFG | Asia | Afghanistan | 2/28/2020 |
# NOW TO CREATE THE SECOND TEMPORARY DATAFRAME('covidtmp2')
covidtmp2 = covid_datadf.iloc[:,26:]
# NOW TO CHECK THE 'covidtmp2' DATAFRAME TO MAKE SURE IT HAS THE NECESSARY COLUMNS
covidtmp2.columns
Index(['new_tests', 'total_tests', 'total_tests_per_thousand',
'new_tests_per_thousand', 'new_tests_smoothed',
'new_tests_smoothed_per_thousand', 'positive_rate', 'tests_per_case',
'tests_units', 'total_vaccinations', 'people_vaccinated',
'people_fully_vaccinated', 'total_boosters', 'new_vaccinations',
'new_vaccinations_smoothed', 'total_vaccinations_per_hundred',
'people_vaccinated_per_hundred', 'people_fully_vaccinated_per_hundred',
'total_boosters_per_hundred', 'new_vaccinations_smoothed_per_million',
'new_people_vaccinated_smoothed',
'new_people_vaccinated_smoothed_per_hundred', 'stringency_index',
'population_density', 'median_age', 'aged_65_older', 'aged_70_older',
'gdp_per_capita', 'extreme_poverty', 'cardiovasc_death_rate',
'diabetes_prevalence', 'female_smokers', 'male_smokers',
'handwashing_facilities', 'hospital_beds_per_thousand',
'life_expectancy', 'human_development_index',
'excess_mortality_cumulative_absolute', 'excess_mortality_cumulative',
'excess_mortality', 'excess_mortality_cumulative_per_million'],
dtype='object')
# NOW TO CHECK THE DATA IN THE 'covidtmp2' DATAFRAME
covidtmp2.head()
| new_tests | total_tests | total_tests_per_thousand | new_tests_per_thousand | new_tests_smoothed | new_tests_smoothed_per_thousand | positive_rate | tests_per_case | tests_units | total_vaccinations | ... | female_smokers | male_smokers | handwashing_facilities | hospital_beds_per_thousand | life_expectancy | human_development_index | excess_mortality_cumulative_absolute | excess_mortality_cumulative | excess_mortality | excess_mortality_cumulative_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
5 rows × 41 columns
# NOW TO COMBINE (CONCAT) THE 'covidtmp1' AND 'covidtmp2' DATAFRAME TO CREATE THE 'covid_vaccinations' DATAFRAME
covid_vaccinations = pd.concat([covidtmp1,covidtmp2], axis=1)
# NOW TO CHECK THE NEWLY CREATED 'covid_vaccinations' DATAFRAME TO MAKE SURE IT HAS THE INTENDED COLUMNS
covid_vaccinations.columns.tolist()
['iso_code', 'continent', 'location', 'date', 'new_tests', 'total_tests', 'total_tests_per_thousand', 'new_tests_per_thousand', 'new_tests_smoothed', 'new_tests_smoothed_per_thousand', 'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated', 'total_boosters', 'new_vaccinations', 'new_vaccinations_smoothed', 'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred', 'people_fully_vaccinated_per_hundred', 'total_boosters_per_hundred', 'new_vaccinations_smoothed_per_million', 'new_people_vaccinated_smoothed', 'new_people_vaccinated_smoothed_per_hundred', 'stringency_index', 'population_density', 'median_age', 'aged_65_older', 'aged_70_older', 'gdp_per_capita', 'extreme_poverty', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'handwashing_facilities', 'hospital_beds_per_thousand', 'life_expectancy', 'human_development_index', 'excess_mortality_cumulative_absolute', 'excess_mortality_cumulative', 'excess_mortality', 'excess_mortality_cumulative_per_million']
# NOW TO CHECK THE DATA IN THE 'covid_vaccinations' DATAFRAME
covid_vaccinations.head()
| iso_code | continent | location | date | new_tests | total_tests | total_tests_per_thousand | new_tests_per_thousand | new_tests_smoothed | new_tests_smoothed_per_thousand | ... | female_smokers | male_smokers | handwashing_facilities | hospital_beds_per_thousand | life_expectancy | human_development_index | excess_mortality_cumulative_absolute | excess_mortality_cumulative | excess_mortality | excess_mortality_cumulative_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Asia | Afghanistan | 2/24/2020 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 1 | AFG | Asia | Afghanistan | 2/25/2020 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 2 | AFG | Asia | Afghanistan | 2/26/2020 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 3 | AFG | Asia | Afghanistan | 2/27/2020 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
| 4 | AFG | Asia | Afghanistan | 2/28/2020 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN |
5 rows × 45 columns
# NOW TO CHECK FOR MISSING VALUES WITHIN THE COLUMNS
# CREATE A FUNCTION THAT WILL CHECK THE PERCENTAGE OF MISSING VALUES IN THE COLUMNS FOR THE DATAFRAME PASSED
def check_nulls(df):
for col in df.columns:
pct_missing = np.mean((df)[col].isnull())
print('{} - {}%'.format(col, pct_missing))
# NOW TO CHECK THE PERCENTAGE OF NULL VALUES IN THE 'covid_deaths' DATAFRAME
check_nulls(covid_deaths)
iso_code - 0.0% continent - 0.06361414865166838% location - 0.0% date - 0.0% population - 0.007121901288141458% total_cases - 0.05569932313104154% new_cases - 0.055742573543722564% new_cases_smoothed - 0.0632393117417662% total_deaths - 0.1373993526854902% new_deaths - 0.13598650587124353% new_deaths_smoothed - 0.0632393117417662% total_cases_per_million - 0.06054336935131589% new_cases_per_million - 0.06057941136188341% new_cases_smoothed_per_million - 0.06804010754935953% total_deaths_per_million - 0.14214248127617551% new_deaths_per_million - 0.14072963446192882% new_deaths_smoothed_per_million - 0.06804010754935953% reproduction_rate - 0.20992308634944892% icu_patients - 0.8777238749486401% icu_patients_per_million - 0.8777238749486401% hosp_patients - 0.8603300006487562% hosp_patients_per_million - 0.8603300006487562% weekly_icu_admissions - 0.9904921176122888% weekly_icu_admissions_per_million - 0.9904921176122888% weekly_hosp_admissions - 0.9840982649376113% weekly_hosp_admissions_per_million - 0.9840982649376113%
# NOW TO CHECK THE PERCENTAGE OF NULL VALUES IN THE 'covid_vaccinations' DATAFRAME
check_nulls(covid_vaccinations)
iso_code - 0.0% continent - 0.06361414865166838% location - 0.0% date - 0.0% new_tests - 0.5802259113222372% total_tests - 0.5793681114707303% total_tests_per_thousand - 0.5793681114707303% new_tests_per_thousand - 0.5802259113222372% new_tests_smoothed - 0.4945900942138156% new_tests_smoothed_per_thousand - 0.4945900942138156% positive_rate - 0.5240364168474774% tests_per_case - 0.5288011706445033% tests_units - 0.47958220101350135% total_vaccinations - 0.7364031515134041% people_vaccinated - 0.7480879713393932% people_fully_vaccinated - 0.769331132367888% total_boosters - 0.9332069460162765% new_vaccinations - 0.7806627404903155% new_vaccinations_smoothed - 0.528152414454288% total_vaccinations_per_hundred - 0.7364031515134041% people_vaccinated_per_hundred - 0.7480879713393932% people_fully_vaccinated_per_hundred - 0.769331132367888% total_boosters_per_hundred - 0.9332069460162765% new_vaccinations_smoothed_per_million - 0.528152414454288% new_people_vaccinated_smoothed - 0.536701579360903% new_people_vaccinated_smoothed_per_hundred - 0.536701579360903% stringency_index - 0.18451346889934908% population_density - 0.09491303062850058% median_age - 0.13806973408204604% aged_65_older - 0.14758482487187066% aged_70_older - 0.1427696122600503% gdp_per_capita - 0.13252647285676183% extreme_poverty - 0.4204516784764321% cardiovasc_death_rate - 0.1367578048973884% diabetes_prevalence - 0.10768631917362878% female_smokers - 0.3276435012650746% male_smokers - 0.3373460105098503% handwashing_facilities - 0.565491937402236% hospital_beds_per_thousand - 0.21571864164870574% life_expectancy - 0.07033237942145365% human_development_index - 0.13587837983954096% excess_mortality_cumulative_absolute - 0.9647220800565138% excess_mortality_cumulative - 0.9647220800565138% excess_mortality - 0.9647220800565138% excess_mortality_cumulative_per_million - 0.9647220800565138%
# FOR THE PURPOSES OF THE PROJECT, I WILL REMOVE THE ROWS IN THE 'continent' COLUMN FOR BOTH THE
# 'covid_deaths' AND 'covid_vaccinations' DATAFRAMES WHERE THERE ARE MISSING VALUES
# THIS IS TO HELP WITH OUTPUTS WHEN GROUPING BY 'continent' LATER ON IN THE PROJECT
covid_deaths = covid_deaths[covid_deaths['continent'].notna()]
covid_vaccinations = covid_vaccinations[covid_vaccinations['continent'].notna()]
# NOW TO CHECK THE PERCENTAGE OF MISSING VALUES IN THE 'covid_deaths' DATAFRAME TO SEE IF THE REMOVAL WORKED
check_nulls(covid_deaths)
iso_code - 0.0% continent - 0.0% location - 0.0% date - 0.0% population - 0.002440301150097766% total_cases - 0.05943711413219196% new_cases - 0.05952949146279503% new_cases_smoothed - 0.0669427722436914% total_deaths - 0.14509399393388864% new_deaths - 0.145124786377423% new_deaths_smoothed - 0.0669427722436914% total_cases_per_million - 0.05943711413219196% new_cases_per_million - 0.05952949146279503% new_cases_smoothed_per_million - 0.0669427722436914% total_deaths_per_million - 0.14509399393388864% new_deaths_per_million - 0.145124786377423% new_deaths_smoothed_per_million - 0.0669427722436914% reproduction_rate - 0.16150636633770074% icu_patients - 0.869416945081677% icu_patients_per_million - 0.869416945081677% hosp_patients - 0.8508414035195763% hosp_patients_per_million - 0.8508414035195763% weekly_icu_admissions - 0.9898461917445459% weekly_icu_admissions_per_million - 0.9898461917445459% weekly_hosp_admissions - 0.9830179673908023% weekly_hosp_admissions_per_million - 0.9830179673908023%
# NOW TO CHECK THE PERCENTAGE OF MISSING VALUES IN THE 'covid_vaccinations' DATAFRAME TO SEE IF THE REMOVAL WORKED
check_nulls(covid_vaccinations)
iso_code - 0.0% continent - 0.0% location - 0.0% date - 0.0% new_tests - 0.5517082108050685% total_tests - 0.5507921356099214% total_tests_per_thousand - 0.5507921356099214% new_tests_per_thousand - 0.5517082108050685% new_tests_smoothed - 0.46025465350802913% new_tests_smoothed_per_thousand - 0.46025465350802913% positive_rate - 0.49170143646749087% tests_per_case - 0.4967898877615433% tests_units - 0.4442271866483965% total_vaccinations - 0.7503117734907854% people_vaccinated - 0.7627904112330834% people_fully_vaccinated - 0.7841988576003449% total_boosters - 0.9500161660328555% new_vaccinations - 0.7973087404350973% new_vaccinations_smoothed - 0.5282828593863066% total_vaccinations_per_hundred - 0.7503117734907854% people_vaccinated_per_hundred - 0.7627904112330834% people_fully_vaccinated_per_hundred - 0.7841988576003449% total_boosters_per_hundred - 0.9500161660328555% new_vaccinations_smoothed_per_million - 0.5282828593863066% new_people_vaccinated_smoothed - 0.5374128188942433% new_people_vaccinated_smoothed_per_hundred - 0.5374128188942433% stringency_index - 0.12911271573955752% population_density - 0.03871379963356992% median_age - 0.08480238949361826% aged_65_older - 0.09496389585995596% aged_70_older - 0.0898215577897184% gdp_per_capita - 0.0788825422241382% extreme_poverty - 0.3863681852473403% cardiovasc_death_rate - 0.08340133331280504% diabetes_prevalence - 0.052354852119289925% female_smokers - 0.28725500762112977% male_smokers - 0.2976166648704408% handwashing_facilities - 0.5412618743360379% hospital_beds_per_thousand - 0.16772643993164077% life_expectancy - 0.012463241520530861% human_development_index - 0.08246216378500716% excess_mortality_cumulative_absolute - 0.9623254453357146% excess_mortality_cumulative - 0.9623254453357146% excess_mortality - 0.9623254453357146% excess_mortality_cumulative_per_million - 0.9623254453357146%
# CHECKING THE DATA TYPES OF BOTH THE 'covid_deaths' AND 'covid_vaccinations' DATAFRAMES, BOTH THE 'date' COLUMNS
# ARE OF object TYPE; I WILL CHANGE TO datatime DATA TYPE AND THE CREATE 3 NEW COLUMNS ('month', 'day', 'year') TO
# BOTH 'covid_deaths' AND 'covid_vaccinations' DATAFRAMES, AGAIN TO HELP WITH OUTPUT WHEN GROUPING BY LATER IN PROJECT
covid_deaths.dtypes
iso_code object continent object location object date object population float64 total_cases float64 new_cases float64 new_cases_smoothed float64 total_deaths float64 new_deaths float64 new_deaths_smoothed float64 total_cases_per_million float64 new_cases_per_million float64 new_cases_smoothed_per_million float64 total_deaths_per_million float64 new_deaths_per_million float64 new_deaths_smoothed_per_million float64 reproduction_rate float64 icu_patients float64 icu_patients_per_million float64 hosp_patients float64 hosp_patients_per_million float64 weekly_icu_admissions float64 weekly_icu_admissions_per_million float64 weekly_hosp_admissions float64 weekly_hosp_admissions_per_million float64 dtype: object
covid_vaccinations.dtypes
iso_code object continent object location object date object new_tests float64 total_tests float64 total_tests_per_thousand float64 new_tests_per_thousand float64 new_tests_smoothed float64 new_tests_smoothed_per_thousand float64 positive_rate float64 tests_per_case float64 tests_units object total_vaccinations float64 people_vaccinated float64 people_fully_vaccinated float64 total_boosters float64 new_vaccinations float64 new_vaccinations_smoothed float64 total_vaccinations_per_hundred float64 people_vaccinated_per_hundred float64 people_fully_vaccinated_per_hundred float64 total_boosters_per_hundred float64 new_vaccinations_smoothed_per_million float64 new_people_vaccinated_smoothed float64 new_people_vaccinated_smoothed_per_hundred float64 stringency_index float64 population_density float64 median_age float64 aged_65_older float64 aged_70_older float64 gdp_per_capita float64 extreme_poverty float64 cardiovasc_death_rate float64 diabetes_prevalence float64 female_smokers float64 male_smokers float64 handwashing_facilities float64 hospital_beds_per_thousand float64 life_expectancy float64 human_development_index float64 excess_mortality_cumulative_absolute float64 excess_mortality_cumulative float64 excess_mortality float64 excess_mortality_cumulative_per_million float64 dtype: object
# FIRST TO CHANGE THE 'date' COLUMN IN THE 'covid_deaths' DATAFRAME TO datetime
covid_deaths['date'] = pd.to_datetime(covid_deaths['date'])
# CHECKING TO SEE IF THE CHANGE WORKED
covid_deaths.dtypes
iso_code object continent object location object date datetime64[ns] population float64 total_cases float64 new_cases float64 new_cases_smoothed float64 total_deaths float64 new_deaths float64 new_deaths_smoothed float64 total_cases_per_million float64 new_cases_per_million float64 new_cases_smoothed_per_million float64 total_deaths_per_million float64 new_deaths_per_million float64 new_deaths_smoothed_per_million float64 reproduction_rate float64 icu_patients float64 icu_patients_per_million float64 hosp_patients float64 hosp_patients_per_million float64 weekly_icu_admissions float64 weekly_icu_admissions_per_million float64 weekly_hosp_admissions float64 weekly_hosp_admissions_per_million float64 dtype: object
# NOW TO ADD THE 'month', 'day', AND 'year' COLUMNS TO THE 'covid_deaths' DATAFRAME
covid_deaths['month'] = covid_deaths['date'].dt.month
covid_deaths['day'] = covid_deaths['date'].dt.day
covid_deaths['year'] = covid_deaths['date'].dt.year
# NOW TO SEE IF COLUMNS WERE SUCCESSFULLY ADDED
covid_deaths
| iso_code | continent | location | date | population | total_cases | new_cases | new_cases_smoothed | total_deaths | new_deaths | ... | icu_patients_per_million | hosp_patients | hosp_patients_per_million | weekly_icu_admissions | weekly_icu_admissions_per_million | weekly_hosp_admissions | weekly_hosp_admissions_per_million | month | day | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Asia | Afghanistan | 2020-02-24 | 39835428.0 | 5.0 | 5.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2 | 24 | 2020 |
| 1 | AFG | Asia | Afghanistan | 2020-02-25 | 39835428.0 | 5.0 | 0.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2 | 25 | 2020 |
| 2 | AFG | Asia | Afghanistan | 2020-02-26 | 39835428.0 | 5.0 | 0.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2 | 26 | 2020 |
| 3 | AFG | Asia | Afghanistan | 2020-02-27 | 39835428.0 | 5.0 | 0.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2 | 27 | 2020 |
| 4 | AFG | Asia | Afghanistan | 2020-02-28 | 39835428.0 | 5.0 | 0.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2 | 28 | 2020 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 138722 | ZWE | Africa | Zimbabwe | 2021-12-04 | 15092171.0 | 138523.0 | 1082.0 | 669.571 | 4709.0 | 1.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 12 | 4 | 2021 |
| 138723 | ZWE | Africa | Zimbabwe | 2021-12-05 | 15092171.0 | 139046.0 | 523.0 | 727.857 | 4710.0 | 1.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 12 | 5 | 2021 |
| 138724 | ZWE | Africa | Zimbabwe | 2021-12-06 | 15092171.0 | 139046.0 | 0.0 | 688.571 | 4710.0 | 0.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 12 | 6 | 2021 |
| 138725 | ZWE | Africa | Zimbabwe | 2021-12-07 | 15092171.0 | 141601.0 | 2555.0 | 996.571 | 4713.0 | 3.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 12 | 7 | 2021 |
| 138726 | ZWE | Africa | Zimbabwe | 2021-12-08 | 15092171.0 | 150628.0 | 9027.0 | 2184.429 | 4720.0 | 7.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 12 | 8 | 2021 |
129902 rows × 29 columns
# NOW TO CHANGE THE 'date' COLUMN IN THE 'covid_vaccinations' DATAFRAME TO datetime
covid_vaccinations['date'] = pd.to_datetime(covid_vaccinations['date'])
# CHECKING TO SEE IF THE CHANGE WORKED
covid_vaccinations.dtypes
iso_code object continent object location object date datetime64[ns] new_tests float64 total_tests float64 total_tests_per_thousand float64 new_tests_per_thousand float64 new_tests_smoothed float64 new_tests_smoothed_per_thousand float64 positive_rate float64 tests_per_case float64 tests_units object total_vaccinations float64 people_vaccinated float64 people_fully_vaccinated float64 total_boosters float64 new_vaccinations float64 new_vaccinations_smoothed float64 total_vaccinations_per_hundred float64 people_vaccinated_per_hundred float64 people_fully_vaccinated_per_hundred float64 total_boosters_per_hundred float64 new_vaccinations_smoothed_per_million float64 new_people_vaccinated_smoothed float64 new_people_vaccinated_smoothed_per_hundred float64 stringency_index float64 population_density float64 median_age float64 aged_65_older float64 aged_70_older float64 gdp_per_capita float64 extreme_poverty float64 cardiovasc_death_rate float64 diabetes_prevalence float64 female_smokers float64 male_smokers float64 handwashing_facilities float64 hospital_beds_per_thousand float64 life_expectancy float64 human_development_index float64 excess_mortality_cumulative_absolute float64 excess_mortality_cumulative float64 excess_mortality float64 excess_mortality_cumulative_per_million float64 dtype: object
# NOW TO ADD THE 'month', 'day', AND 'year' COLUMNS TO THE 'covid_deaths' DATAFRAME
covid_vaccinations['month'] = covid_vaccinations['date'].dt.month
covid_vaccinations['day'] = covid_vaccinations['date'].dt.day
covid_vaccinations['year'] = covid_vaccinations['date'].dt.year
# NOW TO SEE IF COLUMNS WERE SUCCESSFULLY ADDED
covid_vaccinations
| iso_code | continent | location | date | new_tests | total_tests | total_tests_per_thousand | new_tests_per_thousand | new_tests_smoothed | new_tests_smoothed_per_thousand | ... | hospital_beds_per_thousand | life_expectancy | human_development_index | excess_mortality_cumulative_absolute | excess_mortality_cumulative | excess_mortality | excess_mortality_cumulative_per_million | month | day | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Asia | Afghanistan | 2020-02-24 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN | 2 | 24 | 2020 |
| 1 | AFG | Asia | Afghanistan | 2020-02-25 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN | 2 | 25 | 2020 |
| 2 | AFG | Asia | Afghanistan | 2020-02-26 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN | 2 | 26 | 2020 |
| 3 | AFG | Asia | Afghanistan | 2020-02-27 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN | 2 | 27 | 2020 |
| 4 | AFG | Asia | Afghanistan | 2020-02-28 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.5 | 64.83 | 0.511 | NaN | NaN | NaN | NaN | 2 | 28 | 2020 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 138722 | ZWE | Africa | Zimbabwe | 2021-12-04 | 7099.0 | 1485499.0 | 98.428 | 0.47 | 5204.0 | 0.345 | ... | 1.7 | 61.49 | 0.571 | NaN | NaN | NaN | NaN | 12 | 4 | 2021 |
| 138723 | ZWE | Africa | Zimbabwe | 2021-12-05 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 1.7 | 61.49 | 0.571 | NaN | NaN | NaN | NaN | 12 | 5 | 2021 |
| 138724 | ZWE | Africa | Zimbabwe | 2021-12-06 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 1.7 | 61.49 | 0.571 | NaN | NaN | NaN | NaN | 12 | 6 | 2021 |
| 138725 | ZWE | Africa | Zimbabwe | 2021-12-07 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 1.7 | 61.49 | 0.571 | NaN | NaN | NaN | NaN | 12 | 7 | 2021 |
| 138726 | ZWE | Africa | Zimbabwe | 2021-12-08 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 1.7 | 61.49 | 0.571 | NaN | NaN | NaN | NaN | 12 | 8 | 2021 |
129902 rows × 48 columns
# NOW TO FIND GLOBAL NUMBERS WITHIN THE 'covid_death' DATAFRAME
# FIRST, FIND THE TOTAL NUMBER OF COVID CASES
total_covid_cases = covid_deaths['new_cases'].sum()
total_covid_cases
267229814.0
# NOW TO FIND THE TOTAL NUMBER OF COVID DEATHS
total_covid_deaths = covid_deaths['new_deaths'].sum()
total_covid_deaths
5262258.0
# NOW TO FIND THE DEATH PERCENTAGE
death_percentage = (total_covid_deaths/total_covid_cases)*100
death_percentage
1.9691882134079548
# NOW TO CREATE A 'global_numbers' DATAFRAME FROM THE 'total_covid_cases', 'total_covid_deaths' AND 'death_percentage' VARIABLES
data = [[total_covid_cases,total_covid_deaths,death_percentage]]
global_numbers = pd.DataFrame(data, columns=['total_covid_cases','total_covid_deaths','death_percentage'])
# NOW TO CHECK THE 'global_numbers' DATAFRAME
global_numbers
| total_covid_cases | total_covid_deaths | death_percentage | |
|---|---|---|---|
| 0 | 267229814.0 | 5262258.0 | 1.969188 |
# NOW TO CHECK THE TOTAL COVID CASES AND DEATHS BY CONTINENT
covid_numbers_continent = covid_deaths.groupby(['continent'])[['new_cases','new_deaths']].sum().reset_index()
covid_numbers_continent
| continent | new_cases | new_deaths | |
|---|---|---|---|
| 0 | Africa | 8784938.0 | 223923.0 |
| 1 | Asia | 81954793.0 | 1231854.0 |
| 2 | Europe | 78082793.0 | 1451244.0 |
| 3 | North America | 59140387.0 | 1179882.0 |
| 4 | Oceania | 324851.0 | 3397.0 |
| 5 | South America | 38942052.0 | 1171958.0 |
# NOW TO VISUALIZE THE TOTAL COVID CASES BY CONTINENT
continent_case_numbers = covid_numbers_continent.sort_values(by='new_cases', ascending=False)
continent_cases_graph = px.bar(continent_case_numbers, x='continent', y='new_cases', title="Total Covid Cases by Continent",labels={'new_cases':'Total Covid Cases', 'continent':'Continent'})
continent_cases_graph.show()
# NOW TO DO THE SAME WITH TOTAL COVID DEATHS BY CONTINENT
continent_death_numbers = covid_numbers_continent.sort_values(by='new_deaths', ascending=False)
continent_death_graph = px.bar(continent_death_numbers, x='continent', y='new_deaths', title="Total Covid Deaths by Continent", labels={'new_deaths':'Total Covid Deaths', 'continent':'Continent'})
continent_death_graph.show()
# CREATE DATAFRAME THAT SHOWS COUNTRIES WITH HIGHEST INFECTION RATE COMPARED TO POPULATION
# HIGHEST INFECTION RATE
highest_infection_rate = covid_deaths.groupby(['iso_code','location','population'])[['total_cases']].agg(['max']).reset_index()
highest_infection_rate
| iso_code | location | population | total_cases | |
|---|---|---|---|---|
| max | ||||
| 0 | ABW | Aruba | 107195.0 | NaN |
| 1 | AFG | Afghanistan | 39835428.0 | 157542.0 |
| 2 | AGO | Angola | 33933611.0 | 65332.0 |
| 3 | AIA | Anguilla | 15125.0 | NaN |
| 4 | ALB | Albania | 2872934.0 | 202641.0 |
| ... | ... | ... | ... | ... |
| 218 | WSM | Samoa | 200144.0 | 3.0 |
| 219 | YEM | Yemen | 30490639.0 | 10047.0 |
| 220 | ZAF | South Africa | 60041996.0 | 3071064.0 |
| 221 | ZMB | Zambia | 18920657.0 | 210436.0 |
| 222 | ZWE | Zimbabwe | 15092171.0 | 150628.0 |
223 rows × 4 columns
# RESETTING THE MULTI-INDEX OF THE 'highest_infection_rate' DATAFRAME TO A SINGLE INDEX
highest_infection_rate.columns = ["".join(a) for a in highest_infection_rate.columns.to_flat_index()]
# NOW TO FIND AND ADD COLUMN THAT SHOWS THE PERCENTAGE OF POPULATION INFECTED
highest_infection_rate['percent_pop_infected'] = (highest_infection_rate['total_casesmax']) / (highest_infection_rate['population']) * 100
# NOW TO CHECK THE 'highest_infection_rate' DATAFRAME
highest_infection_rate
| iso_code | location | population | total_casesmax | percent_pop_infected | |
|---|---|---|---|---|---|
| 0 | ABW | Aruba | 107195.0 | NaN | NaN |
| 1 | AFG | Afghanistan | 39835428.0 | 157542.0 | 0.395482 |
| 2 | AGO | Angola | 33933611.0 | 65332.0 | 0.192529 |
| 3 | AIA | Anguilla | 15125.0 | NaN | NaN |
| 4 | ALB | Albania | 2872934.0 | 202641.0 | 7.053451 |
| ... | ... | ... | ... | ... | ... |
| 218 | WSM | Samoa | 200144.0 | 3.0 | 0.001499 |
| 219 | YEM | Yemen | 30490639.0 | 10047.0 | 0.032951 |
| 220 | ZAF | South Africa | 60041996.0 | 3071064.0 | 5.114860 |
| 221 | ZMB | Zambia | 18920657.0 | 210436.0 | 1.112202 |
| 222 | ZWE | Zimbabwe | 15092171.0 | 150628.0 | 0.998054 |
223 rows × 5 columns
# NOW TO GRAPH THE PERCENTAGE OF POPULATION INFECTED BY COUNTRY
fig=px.choropleth(highest_infection_rate, locations='iso_code',color='percent_pop_infected', color_continuous_scale=px.colors.sequential.Sunsetdark)
fig.update_layout(title="Percent Population Infected per Country", height=1000)
fig.update_geos(projection_type="orthographic", showocean=True,oceancolor="deepskyblue", showlakes=True, lakecolor="lightskyblue", bgcolor="lightgray")
fig.show()
# NOW TO LOOK AT INFECTION RATE OVER TIME
# CREATE DATAFRAME THAT HIGHEST INFECTION RATE OVER TIME
# INFECTION RATE OVER TIME
infection_over_time = covid_deaths.groupby(['iso_code','location','population','date'])[['total_cases']].agg(['max']).reset_index()
infection_over_time
| iso_code | location | population | date | total_cases | |
|---|---|---|---|---|---|
| max | |||||
| 0 | ABW | Aruba | 107195.0 | 2021-03-29 | NaN |
| 1 | ABW | Aruba | 107195.0 | 2021-03-30 | NaN |
| 2 | ABW | Aruba | 107195.0 | 2021-03-31 | NaN |
| 3 | ABW | Aruba | 107195.0 | 2021-04-01 | NaN |
| 4 | ABW | Aruba | 107195.0 | 2021-04-02 | NaN |
| ... | ... | ... | ... | ... | ... |
| 129580 | ZWE | Zimbabwe | 15092171.0 | 2021-12-04 | 138523.0 |
| 129581 | ZWE | Zimbabwe | 15092171.0 | 2021-12-05 | 139046.0 |
| 129582 | ZWE | Zimbabwe | 15092171.0 | 2021-12-06 | 139046.0 |
| 129583 | ZWE | Zimbabwe | 15092171.0 | 2021-12-07 | 141601.0 |
| 129584 | ZWE | Zimbabwe | 15092171.0 | 2021-12-08 | 150628.0 |
129585 rows × 5 columns
# RESETTING THE MULTI-INDEX OF THE 'highest_infection_rate' DATAFRAME TO A SINGLE INDEX
infection_over_time.columns = ["".join(a) for a in infection_over_time.columns.to_flat_index()]
# NOW TO ADD THE 'percent_pop_infected' COLUMN TO THE 'infection_over_time' DATAFRAME
infection_over_time['percent_pop_infected'] = (infection_over_time['total_casesmax']) / (infection_over_time['population']) * 100
# NOW TO CHECK THE 'infection_over_time' DATAFRAME
infection_over_time
| iso_code | location | population | date | total_casesmax | percent_pop_infected | |
|---|---|---|---|---|---|---|
| 0 | ABW | Aruba | 107195.0 | 2021-03-29 | NaN | NaN |
| 1 | ABW | Aruba | 107195.0 | 2021-03-30 | NaN | NaN |
| 2 | ABW | Aruba | 107195.0 | 2021-03-31 | NaN | NaN |
| 3 | ABW | Aruba | 107195.0 | 2021-04-01 | NaN | NaN |
| 4 | ABW | Aruba | 107195.0 | 2021-04-02 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... |
| 129580 | ZWE | Zimbabwe | 15092171.0 | 2021-12-04 | 138523.0 | 0.917847 |
| 129581 | ZWE | Zimbabwe | 15092171.0 | 2021-12-05 | 139046.0 | 0.921312 |
| 129582 | ZWE | Zimbabwe | 15092171.0 | 2021-12-06 | 139046.0 | 0.921312 |
| 129583 | ZWE | Zimbabwe | 15092171.0 | 2021-12-07 | 141601.0 | 0.938241 |
| 129584 | ZWE | Zimbabwe | 15092171.0 | 2021-12-08 | 150628.0 | 0.998054 |
129585 rows × 6 columns
# NOW TO GRAPH THE COVID INFECTION RATE OVER TIME
fig = px.line(infection_over_time, x="date", y="percent_pop_infected", color="location")
fig.update_layout(title="Percent Population Infected over Time", xaxis_title=None, yaxis_title="Percent Population Infected", plot_bgcolor="lightgray")
fig.show()
# TO CREATE A CLEANER GRAPH I WILL CREATE A DATAFRAME TO GRAPH ONLY 7 LOCATIONS FROM THE 'infection_over_time' DATAFRAME
# (United States, Mexico, United Kingdom, UAE, India, Brazil, and South Africa) -
# NAMING THE DATAFRAME 'graph_locations'
graph_locations = infection_over_time.query('(location==["United States","Mexico","United Kingdom","United Arab Emirates","India","Brazil","South Africa"])')
# CHECKING THE 'graph_locations' DATAFRAME
graph_locations
| iso_code | location | population | date | total_casesmax | percent_pop_infected | |
|---|---|---|---|---|---|---|
| 3146 | ARE | United Arab Emirates | 9991083.0 | 2020-01-29 | 4.0 | 0.000040 |
| 3147 | ARE | United Arab Emirates | 9991083.0 | 2020-01-30 | 4.0 | 0.000040 |
| 3148 | ARE | United Arab Emirates | 9991083.0 | 2020-01-31 | 4.0 | 0.000040 |
| 3149 | ARE | United Arab Emirates | 9991083.0 | 2020-02-01 | 4.0 | 0.000040 |
| 3150 | ARE | United Arab Emirates | 9991083.0 | 2020-02-02 | 5.0 | 0.000050 |
| ... | ... | ... | ... | ... | ... | ... |
| 128320 | ZAF | South Africa | 60041996.0 | 2021-12-04 | 3020569.0 | 5.030760 |
| 128321 | ZAF | South Africa | 60041996.0 | 2021-12-05 | 3031694.0 | 5.049289 |
| 128322 | ZAF | South Africa | 60041996.0 | 2021-12-06 | 3038075.0 | 5.059917 |
| 128323 | ZAF | South Africa | 60041996.0 | 2021-12-07 | 3051222.0 | 5.081813 |
| 128324 | ZAF | South Africa | 60041996.0 | 2021-12-08 | 3071064.0 | 5.114860 |
4755 rows × 6 columns
# NOW TO GRAPH THE COVID INFECTION RATE OVER TIME FOR THE LOCATIONS IN THE 'graph_locations' DATAFRAME
fig = px.line(graph_locations, x="date", y="percent_pop_infected", color="location")
fig.update_layout(title="Percent Population Infected over Time", xaxis_title=None, yaxis_title="Percent Population Infected", plot_bgcolor="lightgray")
fig.show()